Functions and Pivot HW!

https://github.com/rjmaitri/04_Bartolini_Bob_2020.git

library(dplyr)
library(tidyr)
library(reactable)
library(lubridate)
library(readr)

1. Write a function that takes a vector and returns one bootstrapped sample from said vector. Demonstrate that it works.

# One Bootsrap Sample ####
#function to sample from a vector and replace
bootstrap <- function(vec){
  
  one_boot<- sample(vec,
                   size = length(vec),
                   replace = TRUE)
  
  return(one_boot)
}
#output with a random vector
bootstrap(c(1,4,5))
## [1] 4 1 4

2. Write a function that given a vector of values, a request for some number of bootstraps (let’s call the parameter R), and a sample statistic function (e.g., mean, IQR, etc.) returns R number of values of that statistic. Have it default to R = 1000 and the function is mean. Show this works for 10 bootstrapped replicate draws of a mean from some vector. Do the values look reasonable? Compare to the actual mean of the vector. Make sure you are using the function(s) you wrote in #1

#input vec, R(#of bootstraps), mean
#default to 1k= R
#function from part 1
bootstrap <- function(vec){
  
  one_boot<- sample(vec,
                   size = length(vec),
                   replace = TRUE)
  
  return(one_boot)
}
#input vec, R(#of bootstraps), mean
#default to 1k= R
boot_mean <- function(vec, R = 1000, fun = mean) {
out <- replicate(R, bootstrap(vec))
fun(out)
}
#input vector into mean function
boot_mean(c(3,4,5,3,2,50),10)
## [1] 10.51667
#compare to actual mean of vector
mean(c(3,4,5,3,2))
## [1] 3.4
#output the dataframe with sample size
data.frame(R = 1:10) %>%
  rowwise(R) %>%
  summarize(boot_mean(c(3,4,5,3,2,50),10, fun = mean)) 
R boot_mean(c(3, 4, 5, 3, 2, 50), 10, fun = mean)
1 14.966667
2 12.533333
3 9.733333
4 7.833333
5 15.100000
6 10.300000
7 8.816667
8 16.000000
9 9.400000
10 13.483333

3. Write a function that, given a vector of values, a request for some number of bootstraps, and a sample statistic function, returns the original value of the statistic as applied to the vector, the mean of the statistic generated by the bootstrapped reps, the upper and lower 95% CI of the bootstrapped statistic (e.g., the 0.025 and 0.975 quantile), and the bias (i.e., the original value of the statistic - the mean of the bootstrapped statistic).

##write a function to sample from a vector
bootstrap <- function(vec){
  
  one_boot<- sample(vec,
                   size = length(vec),
                   replace = TRUE)
  
  return(one_boot)
}
#function that takes bootstrap samples and produces a mean
boot_mean <- function(vec, R = 1000, fun = mean) {
out <- replicate(R, bootstrap(vec))
fun(out)
}
#test to see that it works
boot_mean(c(3,4,5,3,2,50),10)
## [1] 11.16667
#compare to actual mean of vector
mean(c(3,4,5,3,2))
## [1] 3.4
#Write a function that takes the replicated bootstraps and produces statistics
stats_Bootsfunc <- function(vec, R = 1000, fun = mean){
vals <- replicate(R, bootstrap(vec))
#Statistics for data frame 
bstraps_mean <- mean(vals)
mean_vec <- mean(vec)
firstquant <- quantile(vals,0.025)
thirdquant <- quantile(vals,0.975)
bias <- mean(vec) - mean(vals)
#arrange dataframe with statistics  
out <- data.frame(mean_vec = mean_vec,  
        mean_samp = bstraps_mean,
    firstquantile = firstquant,
    thirdquantile = thirdquant,
    bias = bias)
return(out)
}
#output the dataframe with the function
stats_Bootsfunc(c(4,5,2,2,3,2,4,16,8,9,9,19,8,32,32,32,32,41,4,8,4,5),1)
mean_vec mean_samp firstquantile thirdquantile bias
2.5% 12.77273 13.68182 2 36.275 -0.9090909
reactable(data.frame(R = 1:100) %>%
  rowwise(R) %>%
  summarize(stats_Bootsfunc(c(3,4,5,5,5,6,5,10,4,5,4,4,16,8,9,9,19,18,20,21,22,4,8,4,5,5,6,5,6,4),R=100, fun = mean))) 

4. FiveThirtyEight keeps a great archive of poll data at https://projects.fivethirtyeight.com/polls/. The presidential general election polling data is freely available at https://projects.fivethirtyeight.com/polls-page/president_polls.csv with question, poll id, and cycle defining a unique poll.

4a. Download and look at the data. Is it long or wide?

library(readr)
pres_poll <- read_csv("president_polls.csv")

reactable(pres_poll, resizable = TRUE, wrap = FALSE, bordered = TRUE)

The presidential polls dataset is long, as it has two rows dedicated to each polling question.

4b. Get just the polling data for this last week (from 9/29 to today). Filter on start_date. Also filter down to just Biden and Trump (see candidate_name or answer). Extra credit for using {lubridate} for this, but you can just do a messy %in% string match.

class(pres_poll$start_date)
## [1] "character"
pres_poll$start_date<- as.Date(pres_poll$start_date, format = "%m/%d/%Y")
presPoll_filter <- filter(pres_poll, answer == "Biden" | answer == "Trump")
  
pres_current <- presPoll_filter[presPoll_filter$start_date >= "2020-09-29" & presPoll_filter$start_date <= "2020-10-10",]

reactable(pres_current, resizable = TRUE, wrap = FALSE, bordered = TRUE)

4c. OK, this is your sample. What’s the bootstrapped average percentage for each candidate for nationwide polls (state == "")? Note, this answer will not match 538 given their weighting by poll trustworthiness.

#filter by president and nationwide
Boot_pct <- pres_current %>% 
  select(state, answer, pct)
#replace NA's with zeros to keep dpylr happy
vec_3 <- replace(Boot_pct$state, is.na(Boot_pct$state), 0)
data <- data.frame(Nationwide = c(vec_3),
                   Candidate = c(Boot_pct$answer),
                   pct = Boot_pct$pct)
#Trump & National Pct filter for bootstrap
trumpbootdata <- data %>%
  filter(Candidate == "Trump" & Nationwide == 0)


Donald_boot_mean <- sample(trumpbootdata$pct, 
                             size = length(trumpbootdata$pct),
                             replace = TRUE) %>% mean

The bootstraped national average for Trump is 41.455

#Biden & National Pct filter for bootstrap
Bidenbootdata <- data %>%
  filter(Candidate == "Biden" & Nationwide == 0)


Biden_boot_mean <- sample(Bidenbootdata$pct, 
                             size = length(Bidenbootdata$pct),
                             replace = TRUE) %>% mean

The bootstraped national average for Biden is 50.7191071

4d. What is the average difference between the two candidates by state and national polls? Note, you’ll need to make this a wide data frame to answer! And, well, try the pivot without this advice first, but then….

library(tidyr)
poll_wide <- pres_current %>%
  pivot_wider(names_from = state,
              values_from = pct)
reactable(poll_wide, resizable = TRUE, wrap = FALSE, bordered = TRUE)

Make a unique ID by pasting together the question_id, poll_id, and state. Then select the ID, state, answer, and pct. Also filter out NA diffs

#Create new column with state, question_id and poll_id
Tidy_polls <- pres_current %>%
mutate(unique_ID = paste(state, question_id, poll_id, sep = "_"))

#create a df pivoted wider with columns for each state
Wide_poll <- Tidy_polls %>% 
  select(unique_ID, state, pct, answer) %>%
  pivot_wider(names_from = state,
              values_from = pct) 

reactable(Wide_poll, resizable = TRUE, wrap = FALSE, bordered = TRUE)
#mean for Biden/Trump for each state
meanpct <- 
  aggregate(Wide_poll[, 3:25], list(Wide_poll$answer), mean, na.rm = TRUE)

reactable(meanpct, resizable = TRUE, wrap = FALSE, bordered = TRUE)
#function to display the mean difference for each state
poll_diff <- function(pct) {
  
  result <- (pct - lag(pct))
}
Poll_Diff <- meanpct %>% 
  select_if(is.numeric) %>% 
  mutate_all(funs(difference = poll_diff(.)))%>% 
  head()

reactable(Poll_Diff[,24:46], resizable = TRUE, wrap = FALSE, bordered = TRUE)

5. replicate() has been our friend, but we’ve always had to be a little hacky with it. We’ve either had to fold in means, or use tricksy functions like colMeans and the like.

BUT - what’s interesting about replicate() is that, if you ask it to turn back raw draws from a random number generator - or anything with more than one value - it gives you a matrix or array.

5a. So, I want you to, using the mean and SD of Biden’s national polling average (you’ll need to calculate it!) from above, simulate 1000 draws from that population with a sample size of 50. What are the dimensions of the object. What are in the rows and columns?

The mean for Biden is 50.0705357 with a standard deviation of 3.4386823

#write a function to sample from Biden national polling avg
BidenUSAsims <- function(pct) {
Hopes <-  replicate(1000, sample(Bidenbootdata$pct,
                                  size = length(50),
                                                replace = TRUE))
return(Hopes)
}
BidenSims <- BidenUSAsims()

BidenSims
##    [1] 52.00 47.30 48.60 45.00 51.00 51.00 49.00 50.00 44.00 52.00 52.00 45.00
##   [13] 52.00 48.60 52.00 52.00 53.00 52.00 51.00 52.00 51.00 52.00 51.00 54.88
##   [25] 51.00 52.00 52.00 53.00 52.00 53.00 47.30 49.00 49.00 51.00 45.00 45.00
##   [37] 43.00 52.00 49.00 49.00 54.88 52.00 45.00 45.00 52.00 51.00 53.00 44.00
##   [49] 53.00 48.90 51.00 52.00 45.00 54.88 51.00 48.00 49.00 54.00 52.00 53.00
##   [61] 52.00 53.00 52.00 44.00 52.00 46.00 48.00 57.00 46.00 48.00 51.00 46.00
##   [73] 51.91 56.00 53.00 51.00 48.90 44.00 56.00 46.00 51.00 50.00 48.00 56.20
##   [85] 52.00 49.00 51.91 50.00 48.00 45.00 51.00 49.00 45.00 53.00 52.00 48.00
##   [97] 49.00 47.30 52.00 53.00 52.00 44.00 47.00 51.91 45.00 43.00 52.00 52.00
##  [109] 51.00 51.00 51.91 49.00 52.00 45.00 54.00 56.20 53.00 49.00 53.00 54.00
##  [121] 47.00 45.00 45.00 47.00 52.00 52.00 51.00 51.00 47.00 51.00 48.00 45.00
##  [133] 44.00 43.00 45.00 45.00 54.88 56.20 51.00 51.00 49.00 51.00 44.00 48.00
##  [145] 53.00 47.00 54.00 47.00 56.00 52.00 52.00 51.00 50.00 48.00 52.00 47.00
##  [157] 52.00 51.00 57.00 49.00 45.00 49.00 47.00 54.00 45.00 52.00 51.00 45.00
##  [169] 52.00 57.00 46.00 52.00 43.00 52.00 48.90 48.60 53.00 53.00 48.00 51.00
##  [181] 43.00 56.20 51.00 48.90 45.00 53.00 50.00 45.00 52.00 52.00 51.00 51.00
##  [193] 52.00 47.00 51.91 45.00 47.00 57.16 48.00 53.00 46.00 52.00 52.00 57.16
##  [205] 51.00 53.00 43.00 46.00 54.88 54.00 56.20 43.00 52.00 45.00 57.00 52.00
##  [217] 47.30 48.90 47.00 51.00 52.00 54.88 45.00 53.00 54.88 49.00 44.00 52.00
##  [229] 52.00 52.00 51.00 53.00 49.00 51.00 45.00 53.00 48.90 48.60 52.00 52.00
##  [241] 49.00 43.00 51.00 49.00 51.00 48.00 51.00 51.00 47.00 48.60 49.00 47.00
##  [253] 45.00 57.16 47.00 57.16 53.00 51.91 56.20 52.00 51.00 47.30 52.00 53.00
##  [265] 47.00 51.00 51.00 52.00 52.00 54.88 48.00 45.00 47.00 53.00 52.00 47.00
##  [277] 44.00 54.88 53.00 48.60 48.90 52.00 51.00 45.00 51.00 54.00 52.00 43.00
##  [289] 52.00 53.00 46.00 51.00 49.00 47.00 52.00 43.00 45.00 52.00 51.00 48.00
##  [301] 48.90 52.00 53.00 51.00 48.00 47.00 50.00 52.00 51.00 49.00 48.00 52.00
##  [313] 47.30 50.00 43.00 48.00 52.00 56.20 48.00 52.00 50.00 52.00 47.00 47.00
##  [325] 51.00 56.20 53.00 45.00 50.00 51.00 43.00 48.90 53.00 52.00 52.00 56.20
##  [337] 51.91 53.00 49.00 49.00 48.00 50.00 51.00 50.00 49.00 51.00 53.00 52.00
##  [349] 51.91 52.00 48.90 52.00 43.00 52.00 51.00 47.00 48.00 54.00 48.60 47.00
##  [361] 52.00 53.00 49.00 52.00 52.00 51.00 46.00 45.00 50.00 51.00 48.00 45.00
##  [373] 54.88 50.00 47.00 52.00 49.00 52.00 52.00 45.00 57.16 45.00 51.00 51.00
##  [385] 49.00 50.00 51.00 45.00 51.00 54.00 51.91 52.00 44.00 52.00 47.00 49.00
##  [397] 48.00 51.91 53.00 52.00 51.00 48.00 52.00 51.00 47.00 51.00 51.00 56.20
##  [409] 52.00 48.00 51.00 52.00 53.00 47.00 50.00 48.00 52.00 51.00 51.00 51.00
##  [421] 57.16 49.00 49.00 49.00 57.00 51.00 53.00 51.00 57.00 51.00 51.00 44.00
##  [433] 44.00 45.00 51.00 47.00 51.00 52.00 44.00 51.00 51.00 51.00 51.00 48.60
##  [445] 53.00 52.00 52.00 51.00 51.00 48.00 52.00 45.00 45.00 51.00 48.60 53.00
##  [457] 52.00 47.00 48.00 52.00 52.00 52.00 52.00 51.00 53.00 51.00 45.00 53.00
##  [469] 52.00 43.00 45.00 57.00 53.00 49.00 43.00 44.00 51.00 44.00 57.00 52.00
##  [481] 52.00 43.00 45.00 52.00 50.00 45.00 56.00 51.00 51.00 51.00 50.00 44.00
##  [493] 51.00 53.00 51.00 51.00 51.00 54.00 49.00 52.00 51.00 48.00 49.00 54.88
##  [505] 48.90 51.91 45.00 53.00 51.00 52.00 57.00 51.00 52.00 48.00 52.00 48.00
##  [517] 53.00 52.00 53.00 48.00 52.00 47.30 43.00 52.00 51.00 49.00 50.00 51.00
##  [529] 52.00 51.00 52.00 51.00 51.00 44.00 44.00 52.00 54.88 52.00 57.00 49.00
##  [541] 45.00 47.00 51.91 51.00 51.00 54.00 52.00 54.00 52.00 53.00 51.00 52.00
##  [553] 44.00 53.00 45.00 47.30 51.00 52.00 45.00 48.00 53.00 47.30 45.00 51.00
##  [565] 54.00 53.00 48.00 57.16 53.00 51.00 49.00 52.00 54.88 43.00 52.00 56.00
##  [577] 52.00 52.00 52.00 48.00 52.00 51.00 48.00 48.00 48.00 57.00 51.00 53.00
##  [589] 52.00 51.00 52.00 47.30 45.00 48.00 48.00 57.16 57.00 52.00 53.00 50.00
##  [601] 52.00 48.90 48.00 53.00 51.00 44.00 47.00 51.00 53.00 52.00 53.00 48.00
##  [613] 52.00 47.00 51.00 51.00 56.20 50.00 51.00 54.00 51.00 54.00 45.00 47.00
##  [625] 52.00 53.00 51.00 44.00 52.00 56.00 52.00 51.00 51.00 50.00 51.00 52.00
##  [637] 47.30 52.00 52.00 52.00 44.00 51.00 53.00 49.00 52.00 52.00 45.00 43.00
##  [649] 56.20 47.30 52.00 51.00 51.00 57.16 48.00 47.00 48.00 51.00 57.16 49.00
##  [661] 48.90 51.00 49.00 52.00 51.00 57.16 45.00 57.16 44.00 53.00 52.00 48.00
##  [673] 51.00 47.00 48.60 44.00 53.00 57.00 52.00 52.00 54.88 43.00 47.00 43.00
##  [685] 43.00 48.00 49.00 56.20 44.00 52.00 52.00 52.00 47.00 48.00 51.00 51.00
##  [697] 52.00 45.00 52.00 48.90 52.00 48.00 48.60 51.00 51.00 47.00 44.00 47.00
##  [709] 50.00 52.00 51.00 51.00 52.00 56.20 48.00 48.60 53.00 52.00 52.00 51.00
##  [721] 49.00 48.90 54.88 48.00 47.00 53.00 50.00 52.00 51.00 53.00 52.00 45.00
##  [733] 52.00 45.00 51.00 43.00 45.00 45.00 53.00 52.00 51.00 52.00 52.00 43.00
##  [745] 51.00 48.00 51.91 48.00 44.00 51.00 44.00 53.00 53.00 48.60 52.00 46.00
##  [757] 57.00 52.00 56.20 53.00 49.00 48.00 51.00 48.00 52.00 48.00 52.00 52.00
##  [769] 49.00 53.00 54.00 45.00 47.00 49.00 50.00 51.00 53.00 53.00 51.00 52.00
##  [781] 48.00 47.30 51.00 51.00 51.00 47.00 48.60 51.00 52.00 51.00 51.00 51.00
##  [793] 48.90 45.00 51.00 51.00 53.00 52.00 48.90 48.60 47.00 44.00 56.00 48.00
##  [805] 45.00 44.00 51.00 52.00 52.00 52.00 47.00 53.00 52.00 48.90 51.00 45.00
##  [817] 52.00 52.00 46.00 48.00 51.00 52.00 45.00 44.00 53.00 52.00 47.00 49.00
##  [829] 49.00 56.00 51.00 48.90 52.00 51.00 48.60 51.00 51.91 53.00 54.00 54.88
##  [841] 45.00 45.00 53.00 52.00 51.00 56.00 57.00 54.00 44.00 47.00 48.00 52.00
##  [853] 56.20 48.00 48.00 57.16 52.00 48.00 56.00 57.00 52.00 52.00 43.00 54.88
##  [865] 53.00 57.00 52.00 52.00 57.00 51.00 48.00 43.00 51.00 52.00 47.00 49.00
##  [877] 54.00 48.00 51.00 48.00 51.00 51.00 47.00 52.00 46.00 52.00 49.00 50.00
##  [889] 44.00 51.00 51.00 44.00 57.00 51.00 51.00 50.00 53.00 46.00 57.00 51.00
##  [901] 52.00 53.00 50.00 46.00 46.00 52.00 52.00 51.00 52.00 44.00 52.00 51.00
##  [913] 50.00 52.00 51.00 49.00 44.00 49.00 51.00 52.00 53.00 43.00 53.00 45.00
##  [925] 52.00 49.00 50.00 48.00 53.00 48.00 47.30 43.00 51.00 51.00 54.00 53.00
##  [937] 47.00 52.00 51.00 51.00 47.30 54.00 45.00 47.30 45.00 44.00 51.00 45.00
##  [949] 53.00 54.88 53.00 52.00 57.00 51.00 51.00 45.00 53.00 52.00 52.00 52.00
##  [961] 53.00 54.00 49.00 51.00 49.00 56.00 49.00 52.00 57.16 52.00 49.00 52.00
##  [973] 43.00 49.00 57.16 47.00 53.00 51.00 52.00 43.00 46.00 47.00 52.00 53.00
##  [985] 44.00 51.00 56.20 50.00 51.91 57.00 51.00 52.00 52.00 54.00 47.00 48.90
##  [997] 48.00 53.00 51.00 47.00

5b. Yuck. Can you turn this into something usable? Say, first make it a tibble or data frame, and then pivot it to long, such that you end up with a column that has an identifier for sim and a column with a single value from that sim? (Oh, and for all columns, cols = everything())

#errorneous attempt ####
#convert to data frame and pivot
longBiden <- as.data.frame(BidenSims) %>%
  pivot_longer(cols = everything(),
               names_to = "sim_ID",
               values_to= "pct")

reactable(longBiden, resizable = TRUE, wrap = FALSE, bordered = TRUE)